Preparation
Genotyped data set preparation
First we define input and output path shortcuts.
# Shortcut paths to the project's input and output data folders,
# resolved relative to the current working directory.
input <- file.path(getwd(), "data", "Input")
output <- file.path(getwd(), "data", "Output")
Then we delete the output folder, and all the files and folders it contains, from the previous run.
unlink(output, recursive = TRUE)
The following code chunk consists of the arguments that this R script can take from the user: “verbose” controls whether output is printed to the terminal; “imputated” indicates whether the program should expect an imputated dataset; “EP” performs the data corrections needed before analysis of the Epistasis Project dataset; and “ADNI” indicates whether the input dataset also includes the ADNI dataset.
verbose <- FALSE
Then, we proceed to read and store the genotyped input data.
# Strings that encode missing genotypes/values in the raw genotyped CSV.
encoded_NA <- c("00", "", "???", "-9")
# read.csv2 expects semicolon-separated values with comma decimal marks.
df <- read.csv2(file.path(input, "genotyped.csv"),
na.strings = encoded_NA)
df$DHFR_rs70991108_INDEL <- NULL # Deleting indel from the dataset
df$PPARG_rs709149 <- NULL # Genes with unclear genotyping were removed because of suspected error
df$MS4A4E_rs670139 <- NULL # Genes with unclear genotyping were removed because of suspected error
df <- df[rowSums(is.na(df)) != ncol(df), ] # Removing rows that are all NA's
str(df, list.len = 30)
## 'data.frame': 2441 obs. of 97 variables:
## $ ID : chr "ASR382AST5B01" "ASR383AST5B03" "ASR383AST5D01" "ASR384AST5B05" ...
## $ Plate : chr "AST5" "AST5" "AST5" "AST5" ...
## $ WELL : chr "B01" "B03" "D01" "B05" ...
## $ CentreCODE : chr "RG 728" "RG 733" "RG 606" "RG 743" ...
## $ Diag : chr "AD" "AD" "AD" "AD" ...
## $ SexLett : chr "Male" "Female" "Male" "Female" ...
## $ Age : int 69 69 81 76 75 75 80 74 71 76 ...
## $ AgeExam : num 69 69 81 76 75 75 80 74 71 76 ...
## $ AgeOnset : num NA NA NA NA NA NA NA NA NA NA ...
## $ AgeDeath : int NA NA NA NA NA NA NA NA NA NA ...
## $ Centre_APOE : int 33 34 33 33 33 33 23 33 23 33 ...
## $ Centre : chr "OVIEDO" "OVIEDO" "OVIEDO" "OVIEDO" ...
## $ rs429358 : chr "TT" "CT" "TT" "TT" ...
## $ rs7412 : chr "CC" "CC" "CC" "CC" ...
## $ LGC_APOE : int 33 34 33 33 33 33 23 33 23 33 ...
## $ LGC_E4. : int 0 1 0 0 0 0 0 0 0 0 ...
## $ FID : chr "ASR382AST5B01" "ASR383AST5B03" "ASR383AST5D01" "ASR384AST5B05" ...
## $ IID : chr "ASR382AST5B01" "ASR383AST5B03" "ASR383AST5D01" "ASR384AST5B05" ...
## $ PID : int 0 0 0 0 0 0 0 0 0 0 ...
## $ MID : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Sex : int 1 2 1 2 2 1 1 2 2 1 ...
## $ Dx : int 2 2 2 2 2 2 2 2 2 2 ...
## $ SORT1_rs17585355 : chr "AA" "AA" "AA" "AA" ...
## $ SORT1_rs2228604 : chr "CC" "CC" "AC" "AC" ...
## $ SORT1_rs7536292 : chr "TT" "CT" "TT" "TT" ...
## $ SORT1_rs17646665 : chr "AA" "AA" "AA" "AA" ...
## $ SORT1_rs1149175 : chr "GG" "GG" "GG" "GG" ...
## $ SORT1_rs10745354 : chr "TT" "TT" "CT" "CT" ...
## $ IL10_rs1800896 : chr "AA" "GA" "GG" "GA" ...
## $ BIN1_rs744373 : chr "GG" "GA" "GA" "GG" ...
## [list output truncated]
Reading the snps that we want to study
# Read the SNP identifiers (GENE_rsID, one per whitespace-separated token)
# selected for this study.
vector_of_gene_snps <- scan(file.path(input, "snps.txt"),
what = "character",
quiet = !verbose)
vector_of_gene_snps
## [1] "TCN2_rs5749131" "TCN2_rs5753231" "TCN2_rs16988828"
## [4] "TCN2_rs9606756" "TCN2_rs757874" "TCN2_rs1801198"
## [7] "TCN2_rs5749135" "TCN2_rs9621049" "TCN2_rs1131603"
## [10] "TCN2_rs7289553" "TCN2_rs5997711" "MTRR_rs1801394"
## [13] "MTRR_rs13181011" "MTRR_rs326120" "MTRR_rs1532268"
## [16] "MTRR_rs7703033" "MTRR_rs6555501" "MTRR_rs162035"
## [19] "MTRR_rs3815743" "MTRR_rs162040" "MTRR_rs10475399"
## [22] "SORT1_rs17585355" "SORT1_rs2228604" "SORT1_rs7536292"
## [25] "SORT1_rs17646665" "SORT1_rs1149175" "SORT1_rs10745354"
## [28] "BDNF_rs6265" "BDNF_rs11030102" "BDNF_rs11030104"
## [31] "BDNF_rs11030119" "BDNF_rs12288512" "SHMT1_rs16961153"
## [34] "SHMT1_rs1979277" "SHMT1_rs669340" "SHMT1_rs643333"
## [37] "SHMT1_rs9901160" "RFC1_rs11096990" "RFC1_rs4975002"
## [40] "RFC1_rs4975009" "RFC1_rs6851075" "APOE_rs429358"
## [43] "APOE_rs7412" "APOE_rs597668" "ABCA1_rs1800977"
## [46] "ABCA1_rs2422493" "CYP46A1_rs7157609" "CYP46A1_rs4900442"
## [49] "DBH_rs1611115" "DBH_rs1611131" "GAB2_rs4945261"
## [52] "GAB2_rs2373115" "MAPT_rs242557" "MAPT_rs2471738"
## [55] "NPC1_rs2236707" "NPC1_rs4800488" "NR1H2_rs2695121"
## [58] "NR1H2_rs1052533" "NTRK2_rs1187323" "NTRK2_rs1545285"
## [61] "PEMT_rs7946" "PEMT_rs12325817" "VDR_rs731236"
## [64] "VDR_rs7975232" "BIN1_rs744373" "CD33_rs3865444"
## [67] "CDK5_rs2069442" "HMDX1_rs2071746" "HMGCR_rs3931914"
## [70] "IL10_rs1800896" "LRP1_rs1799986" "MTHFD1L_rs11754661"
## [73] "PRNP_rs1799990" "SLC19A1_rs1051266" "TRAK2_rs13022344"
length(vector_of_gene_snps)
## [1] 75
Choosing other variables of interest
variables <- c("ID", "Diag", "SexLett", "Age", "Centre", "LGC_E4.")
Subsetting the dataframe with the desired variables
# Keep only the chosen covariates plus the SNP genotype columns.
df <- df[, c(variables, vector_of_gene_snps)]
dim(df)
## [1] 2441 81
After that, we order the genotypes so that the alleles of heterozygotes are always sorted in alphabetical order (“A”, “C”, “G”, “T”).
df[,c(vector_of_gene_snps)] <- lapply(df[,c(vector_of_gene_snps)], order_heterozygotes)
Then we create a list of objects that for each SNP contains information about its name, gene it belongs, genotype counts, genotype frequencies, allele frequencies and counts, minor allele and major allele etc.
# Build a per-SNP summary object: gene, rsID, genotype counts/frequencies,
# allele counts/frequencies, minor/major allele, etc. `generate_object` is a
# project-local helper; `exists` tells it whether to extend a previous list.
list_of_objects <- generate_object(exists = exists('list_of_objects'),
dataset = df,
snps = vector_of_gene_snps,
verbose = verbose)
list_of_objects
## This is just a convenience function, invocated when print() is used, to provide an overview of the genes and snps present in this list. To view its contents, use list_of_objects$your_snp_id
##
##
## $snps
## [1] "rs5749131" "rs5753231" "rs16988828" "rs9606756" "rs757874"
## [6] "rs1801198" "rs5749135" "rs9621049" "rs1131603" "rs7289553"
## [11] "rs5997711" "rs1801394" "rs13181011" "rs326120" "rs1532268"
## [16] "rs7703033" "rs6555501" "rs162035" "rs3815743" "rs162040"
## [21] "rs10475399" "rs17585355" "rs2228604" "rs7536292" "rs17646665"
## [26] "rs1149175" "rs10745354" "rs6265" "rs11030102" "rs11030104"
## [31] "rs11030119" "rs12288512" "rs16961153" "rs1979277" "rs669340"
## [36] "rs643333" "rs9901160" "rs11096990" "rs4975002" "rs4975009"
## [41] "rs6851075" "rs429358" "rs7412" "rs597668" "rs1800977"
## [46] "rs2422493" "rs7157609" "rs4900442" "rs1611115" "rs1611131"
## [51] "rs4945261" "rs2373115" "rs242557" "rs2471738" "rs2236707"
## [56] "rs4800488" "rs2695121" "rs1052533" "rs1187323" "rs1545285"
## [61] "rs7946" "rs12325817" "rs731236" "rs7975232" "rs744373"
## [66] "rs3865444" "rs2069442" "rs2071746" "rs3931914" "rs1800896"
## [71] "rs1799986" "rs11754661" "rs1799990" "rs1051266" "rs13022344"
##
## $genes
## [1] "TCN2" "MTRR" "SORT1" "BDNF" "SHMT1" "RFC1" "APOE"
## [8] "ABCA1" "CYP46A1" "DBH" "GAB2" "MAPT" "NPC1" "NR1H2"
## [15] "NTRK2" "PEMT" "VDR" "BIN1" "CD33" "CDK5" "HMDX1"
## [22] "HMGCR" "IL10" "LRP1" "MTHFD1L" "PRNP" "SLC19A1" "TRAK2"
##
##
## The list has a total of 75 snps and 28 genes
Each of the SNP objects present in the list_of_objects have the following structure:
list_of_objects[[1]]
## ###### rs5749131 ######
##
## [gene] TCN2
##
## [id] rs5749131
##
## [geno_count] AA = 173; AG = 478; GG = 288;
##
## [geno_freq] AA = 18.424; AG = 50.905; GG = 30.671;
##
## [allele_count] A = 824; G = 1054;
##
## [allele_freq] A = 43.876; G = 56.124;
##
## [minor_allele] A
##
## [major_allele] G
We can extract just the reference snp id from this object
# Extract the bare rsID from every SNP object as an unnamed character vector.
# vapply (rather than sapply) guarantees a character(1) result per element
# regardless of input shape; unname() drops the element names inherited
# from list_of_objects, as before.
vector_of_snps <- unname(vapply(list_of_objects, function(x) x$id, character(1)))
vector_of_snps
## [1] "rs5749131" "rs5753231" "rs16988828" "rs9606756" "rs757874"
## [6] "rs1801198" "rs5749135" "rs9621049" "rs1131603" "rs7289553"
## [11] "rs5997711" "rs1801394" "rs13181011" "rs326120" "rs1532268"
## [16] "rs7703033" "rs6555501" "rs162035" "rs3815743" "rs162040"
## [21] "rs10475399" "rs17585355" "rs2228604" "rs7536292" "rs17646665"
## [26] "rs1149175" "rs10745354" "rs6265" "rs11030102" "rs11030104"
## [31] "rs11030119" "rs12288512" "rs16961153" "rs1979277" "rs669340"
## [36] "rs643333" "rs9901160" "rs11096990" "rs4975002" "rs4975009"
## [41] "rs6851075" "rs429358" "rs7412" "rs597668" "rs1800977"
## [46] "rs2422493" "rs7157609" "rs4900442" "rs1611115" "rs1611131"
## [51] "rs4945261" "rs2373115" "rs242557" "rs2471738" "rs2236707"
## [56] "rs4800488" "rs2695121" "rs1052533" "rs1187323" "rs1545285"
## [61] "rs7946" "rs12325817" "rs731236" "rs7975232" "rs744373"
## [66] "rs3865444" "rs2069442" "rs2071746" "rs3931914" "rs1800896"
## [71] "rs1799986" "rs11754661" "rs1799990" "rs1051266" "rs13022344"
And then we can convert the categorical variables in the dataframe into the factor datatype.
df[-c(1,4)] <- lapply(df[-c(1,4)], as.factor)
Imputated data set preparation
First we read and store the imputated input data.
# Missing-value encodings used in the imputed CSV export.
encoded_NA <- c("#NULL!", "NA")
imput_df <- read.csv2(file.path(input, "imputated.csv"),
na.strings = encoded_NA)
str(imput_df, list.len = 30)
## 'data.frame': 6357 obs. of 93 variables:
## $ id : int 3406 4010 12046 3784 2210 8831 1106 5426 3578 8432 ...
## $ sex : chr "Woman" "Woman" "Woman" "Woman" ...
## $ age.at.entry : num 58 58 59 58 60 60 55 57 57 57 ...
## $ Age.to.Use : num 60 60 60 60 60 60 60 61 61 61 ...
## $ Age75 : chr "lessthan75" "lessthan75" "lessthan75" "lessthan75" ...
## $ prevalent_dementia: int 0 0 0 0 0 0 0 0 0 0 ...
## $ incident_dementia : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Dx : int 0 0 0 0 0 0 0 0 0 0 ...
## $ DiagName : chr "CTRL" "CTRL" "CTRL" "CTRL" ...
## $ ageatdementia : num NA NA NA NA NA NA NA NA NA NA ...
## $ followup : num 2.0589 1.8563 1.2238 1.9439 0.0493 ...
## $ E4status : chr "E4+" "2" "E4-" "E4-" ...
## $ apoe : int NA NA NA NA NA NA NA NA NA NA ...
## $ rs429358 : num 1.017 0.981 1.999 2 1.994 ...
## $ rs7412 : num 1 1.05 1 2 2 ...
## $ rs670139 : num 1 2 2 1 2 2 0 2 1 2 ...
## $ rs242557 : int 2 NA 1 NA 1 0 0 1 1 1 ...
## $ rs1052533 : int 1 NA 0 1 1 1 2 1 0 0 ...
## $ rs2471738 : int 2 1 0 0 0 0 0 0 1 1 ...
## $ rs4975002 : num 0.001 1.001 1.999 1 0.999 ...
## $ rs1131603 : num 2 2 1.08 2 1.66 ...
## $ rs7975232 : num 0 2 0 1.07 2 ...
## $ rs2695121 : num 1 2 1.03 2 1.01 ...
## $ rs3865444 : num 1.999 0.141 1.005 2 0.784 ...
## $ rs4975009 : num 0.006 1 2 0 0.808 ...
## $ rs1800977 : num 1 2 0 0 1 2 1 2 1 1 ...
## $ rs709149 : num 1 2 1 1 1 0 2 0 1 1 ...
## $ rs12288512 : num 1 2 2 1 1 ...
## $ rs4945261 : int 2 1 0 1 2 2 2 2 2 1 ...
## $ rs7946 : num 0.008 1 0 0 0 0 0 1 0 0 ...
## [list output truncated]
Then we subset the data of interest.
# Covariates of interest in the imputed dataset; subset them together
# with the studied SNP dosage columns.
imput_variables <- c("id", "DiagName", "sex", "Age.to.Use", "E4status")
imput_df <- imput_df[,c(imput_variables, vector_of_snps)]
dim(imput_df)
## [1] 6357 80
Then, we import data from the allele schema used in the imputation, in order to reconvert it back to genotypes.
# Reference/alternative allele scheme used during imputation; needed to
# convert allele dosages back into genotypes. The file is semicolon-separated.
imput_scheme <- read.csv(file.path(input, "imputation_scheme.csv"), sep = ";")
imput_scheme <- imput_scheme[imput_scheme$SNP %in% vector_of_snps,]
colnames(imput_scheme) <- c("snp_id", "reference", "alternative")
There are still some SNPs that are not found in the reference scheme. It seems some of the SNPs in the dataset were directly genotyped and codified as dummy allele dosage data, so we will use the human reference genome GRCh37 to recodify them.
# SNPs present in our data but absent from the imputation scheme; these were
# directly genotyped and must be recoded from the GRCh37 reference instead.
genotyped_snps <- setdiff(vector_of_snps, imput_scheme$snp_id)
genotyped_snps
## [1] "rs1052533"
# GRCh37 lookup table: chromosome, position, rsID, reference and
# alternative allele for the SNPs under study.
GRCh37 <- read.table(file.path(input, "GRCh37_snps.txt"), quote = "\"", comment.char = "")
colnames(GRCh37) <- c("chromosome", "location", "snp_id", "reference", "alternative")
# Append the directly-genotyped SNPs to the scheme, taking rsID / reference /
# alternative alleles from the GRCh37 table. seq_along() replaces the
# which(genotyped_snps == snp) re-search on every iteration, and columns are
# selected by name rather than by position (original c(3,4,5)) to guard
# against column reordering.
index <- nrow(imput_scheme)
for (i in seq_along(genotyped_snps)) {
  hit <- GRCh37[GRCh37$snp_id == genotyped_snps[i], c("snp_id", "reference", "alternative")]
  imput_scheme[index + i, ] <- hit
}
row.names(imput_scheme) <- NULL
So the final dataframe used to recodify the imputated data into genotypes will be the following one:
imput_scheme
|
snp_id
|
reference
|
alternative
|
|
rs10475399
|
A
|
G
|
|
rs1051266
|
T
|
C
|
|
rs1052533
|
G
|
A
|
|
rs10745354
|
T
|
C
|
|
rs11030102
|
C
|
G
|
|
rs11030104
|
A
|
G
|
|
rs11030119
|
G
|
A
|
|
rs11096990
|
C
|
T
|
|
rs1131603
|
T
|
C
|
|
rs1149175
|
G
|
A
|
|
rs11754661
|
G
|
A
|
|
rs1187323
|
A
|
C
|
|
rs12288512
|
G
|
A
|
|
rs12325817
|
C
|
G
|
|
rs13022344
|
T
|
C
|
|
rs13181011
|
T
|
C
|
|
rs1532268
|
C
|
T
|
|
rs1545285
|
T
|
G
|
|
rs1611115
|
C
|
T
|
|
rs1611131
|
A
|
G
|
|
rs162035
|
G
|
A
|
|
rs162040
|
A
|
C
|
|
rs16961153
|
G
|
A
|
|
rs16988828
|
A
|
G
|
|
rs17585355
|
A
|
C
|
|
rs17646665
|
A
|
G
|
|
rs1799986
|
C
|
T
|
|
rs1799990
|
A
|
G
|
|
rs1800896
|
T
|
C
|
|
rs1800977
|
G
|
A
|
|
rs1801198
|
C
|
G
|
|
rs1801394
|
A
|
G
|
|
rs1979277
|
G
|
A
|
|
rs2069442
|
C
|
G
|
|
rs2071746
|
A
|
T
|
|
rs2228604
|
G
|
T
|
|
rs2236707
|
C
|
T
|
|
rs2373115
|
C
|
A
|
|
rs2422493
|
G
|
A
|
|
rs242557
|
A
|
G
|
|
rs2471738
|
T
|
C
|
|
rs2695121
|
C
|
T
|
|
rs326120
|
A
|
G
|
|
rs3815743
|
A
|
G
|
|
rs3865444
|
C
|
A
|
|
rs3931914
|
C
|
G
|
|
rs429358
|
T
|
C
|
|
rs4800488
|
A
|
C
|
|
rs4900442
|
C
|
T
|
|
rs4945261
|
G
|
A
|
|
rs4975002
|
T
|
G
|
|
rs4975009
|
T
|
C
|
|
rs5749131
|
G
|
A
|
|
rs5749135
|
C
|
T
|
|
rs5753231
|
C
|
T
|
|
rs597668
|
T
|
C
|
|
rs5997711
|
C
|
T
|
|
rs6265
|
C
|
T
|
|
rs643333
|
G
|
T
|
|
rs6555501
|
T
|
C
|
|
rs669340
|
C
|
G
|
|
rs6851075
|
T
|
C
|
|
rs7157609
|
G
|
A
|
|
rs7289553
|
A
|
G
|
|
rs731236
|
A
|
G
|
|
rs7412
|
C
|
T
|
|
rs744373
|
A
|
G
|
|
rs7536292
|
T
|
C
|
|
rs757874
|
G
|
T
|
|
rs7703033
|
G
|
A
|
|
rs7946
|
C
|
T
|
|
rs7975232
|
C
|
A
|
|
rs9606756
|
A
|
G
|
|
rs9621049
|
C
|
T
|
|
rs9901160
|
G
|
A
|
First, though, the imputation data is converted to the numeric datatype.
# Coerce all dosage columns (everything after the 5 covariates) to numeric.
# as.numeric can be passed to lapply directly; the anonymous wrapper
# function(x) as.numeric(x) was redundant.
imput_df[, 6:ncol(imput_df)] <- lapply(imput_df[, 6:ncol(imput_df)], as.numeric)
Then we proceed to the recoding of the imputation data into genotyped one.
# Recode imputed allele dosages back into genotypes using the allele scheme
# (project-local helper). match_strands is spelled out as TRUE: `T` is a
# reassignable variable in R and should not be used for logicals.
imput_df <- genotype_imputated_df(list_of_objects = list_of_objects,
df = imput_df,
scheme = imput_scheme,
match_strands = TRUE,
verbose = verbose)
Then, as we did with the genotyped dataset, we order the genotypes so that the alleles of heterozygotes are always sorted in alphabetical order (“A”, “C”, “G”, “T”).
imput_df[,c(vector_of_snps)] <- lapply(imput_df[,c(vector_of_snps)], order_heterozygotes)
And we convert data types from characters to factors
imput_df[-c(1,4)] <- lapply(imput_df[-c(1,4)], as.factor)
Merging imputated and genotyped datasets
Renaming variables names in the datasets
# Harmonise column names across both datasets before merging.
colnames(df) <- c("ID", "Diag", "Sex", "Age_to_use", "Centre", "E4status", vector_of_snps)
colnames(imput_df) <- c("ID", "Diag", "Sex", "Age_to_use", "E4status", vector_of_snps)
As we are studying LOAD, we keep only the cases aged 60 years or older in the datasets
# Keep a copy of each dataset before filtering so the number of removed
# cases can be reported below.
df_old <- df
imput_df_old <- imput_df
# Late-onset AD: keep only individuals aged 60 or older.
df <- df[df$Age_to_use >= 60,]
imput_df <- imput_df[imput_df$Age_to_use >= 60,]
And we can see that 81 cases were removed from the genotype dataset, and 0 from the imputated.
nrow(df_old) - nrow(df)
## [1] 81
# 81
nrow(imput_df_old) - nrow(imput_df)
## [1] 0
# 0
We can calculate the median of the combined population
median(c(df$Age_to_use, imput_df$Age_to_use))
## [1] 82
# 82
Renaming the levels of E4status in the genotyped dataset.
# Current levels of the E4 carrier indicator (sorted: "0", "1").
levels(df$E4status)
## [1] "0" "1"
# Positional recode: "0" -> "E4-", "1" -> "E4+".
levels(df$E4status) <- c("E4-", "E4+")
And the levels of Diagnosis in the imputated dataset.
# Current diagnosis levels (sorted: "CTRL", "dementiaAD").
levels(imput_df$Diag)
## [1] "CTRL" "dementiaAD"
# Positional recode: "CTRL" -> "Control", "dementiaAD" -> "AD".
levels(imput_df$Diag) <- c("Control", "AD")
We also create a binary variable in both datasets that tells whether the age of the individual is equal or above the median of the population. In this case, as we saw earlier it is 82.
# Dichotomise age at the population median (82). Note the labels: ages
# exactly equal to 82 fall in the ">82" group.
df$Age82 <- ifelse(df$Age_to_use >= 82, ">82", "<82")
imput_df$Age82 <- ifelse(imput_df$Age_to_use >= 82, ">82", "<82")
We found a case of unknown gender in the imputated dataset, so we remove it and reset the levels. And then rename them as “Male” and “Female”.
table(imput_df$Sex)
##
## 1 Man Woman
## 1 2528 3828
# Drop the single individual with unknown gender (coded "1"); the factor
# comparison `!= 1` works because "1" is one of the factor's levels.
imput_df <- imput_df[imput_df$Sex != 1,]
# Re-factor to drop the now-unused "1" level (leaving "Man", "Woman"), then
# rename positionally to "Male"/"Female" ("Man" sorts before "Woman").
imput_df$Sex <- factor(imput_df$Sex)
levels(imput_df$Sex) <- c("Male", "Female")
And we create a new variable “Centre” to record that these cases come from the Rotterdam centre.
imput_df$Centre <- "ROTTERDAM"
The last step before merging is extracting the data from Rotterdam to fill up the list_of_objects created in a previous step.
# Extend list_of_objects with imputation-based counts/frequencies for the
# Rotterdam data. `verbose` is forwarded explicitly for consistency with the
# earlier generate_object() call on the genotyped dataset.
list_of_objects <- generate_object(exists = exists('list_of_objects'),
dataset = imput_df,
snps = vector_of_snps,
verbose = verbose)
Then, the final structure obtained would be similar to the following one:
str(list_of_objects)
## This is just a convenience function, invocated when str() is used, to provide an overview of the list elements. To view its contents, use list_of_objects$your_snp_id
##
## An example of the structure of the objects in the list is the following one:
##
## rs5749131 : List of 14
## $ gene : chr "TCN2"
## $ id : chr "rs5749131"
## $ geno_count : 'table' int [1:3(1d)] 173 478 288
## $ geno_freq : 'table' num [1:3(1d)] 18.4 50.9 30.7
## $ allele_count : Named num [1:2] 824 1054
## $ allele_freq : Named num [1:2] 43.9 56.1
## $ minor_allele : chr "A"
## $ major_allele : chr "G"
## $ imput_geno_count : 'table' int [1:3(1d)] 934 2336 1527
## $ imput_geno_freq : 'table' num [1:3(1d)] 19.5 48.7 31.8
## $ imput_allele_count: Named num [1:2] 4204 5390
## $ imput_allele_freq : Named num [1:2] 43.8 56.2
## $ imput_minor_allele: chr "A"
## $ imput_major_allele: chr "G"
Once we’ve done that, we can proceed with the merging.
# Merge on the shared covariates and every SNP column; all = TRUE performs a
# full outer join so unmatched rows from both datasets are kept. TRUE is
# spelled out instead of the reassignable shorthand `T`.
matching_variables <- c("ID", "Diag", "Sex", "E4status", "Age_to_use", "Age82", "Centre", vector_of_snps)
All <- merge(x = df, y = imput_df, by = matching_variables, all = TRUE)
str(All, list.len = 30)
## 'data.frame': 8716 obs. of 82 variables:
## $ ID : chr "10" "100" "1000" "10000" ...
## $ Diag : Factor w/ 2 levels "AD","Control": 2 2 1 2 2 2 2 2 2 2 ...
## $ Sex : Factor w/ 2 levels "Female","Male": 2 1 2 1 1 2 1 2 2 1 ...
## $ E4status : Factor w/ 3 levels "E4-","E4+","2": 1 1 2 1 1 1 1 1 2 1 ...
## $ Age_to_use: num 82 77 82 78 92 78 82 83 86 87 ...
## $ Age82 : chr ">82" "<82" ">82" "<82" ...
## $ Centre : Factor w/ 7 levels "BRISTOL","MADRID",..: 7 7 7 7 7 7 7 7 7 7 ...
## $ rs5749131 : Factor w/ 3 levels "AA","AG","GG": 1 3 2 2 3 2 2 2 2 2 ...
## $ rs5753231 : Factor w/ 3 levels "CC","CT","TT": 2 1 1 2 1 2 2 1 1 2 ...
## $ rs16988828: Factor w/ 3 levels "AA","AG","GG": 1 2 1 1 1 1 1 1 1 1 ...
## $ rs9606756 : Factor w/ 3 levels "AA","AG","GG": 1 2 1 1 1 1 1 2 1 2 ...
## $ rs757874 : Factor w/ 3 levels "GG","GT","TT": 1 1 2 2 1 2 2 1 1 1 ...
## $ rs1801198 : Factor w/ 3 levels "CC","CG","GG": 3 1 2 2 1 2 2 2 2 2 ...
## $ rs5749135 : Factor w/ 3 levels "CC","CT","TT": 1 2 2 2 3 2 2 1 2 1 ...
## $ rs9621049 : Factor w/ 3 levels "CC","CT","TT": 1 2 1 1 1 1 1 2 1 2 ...
## $ rs1131603 : Factor w/ 3 levels "CC","CT","TT": 3 3 3 3 2 3 3 3 3 3 ...
## $ rs7289553 : Factor w/ 3 levels "AA","AG","GG": 1 3 2 2 3 2 2 1 2 2 ...
## $ rs5997711 : Factor w/ 3 levels "CC","CT","TT": 3 1 2 2 1 2 2 2 2 2 ...
## $ rs1801394 : Factor w/ 3 levels "AA","AG","GG": 3 2 1 3 3 3 1 2 3 3 ...
## $ rs13181011: Factor w/ 3 levels "CC","CT","TT": 1 3 3 3 2 2 3 3 2 2 ...
## $ rs326120 : Factor w/ 3 levels "AA","AG","GG": 1 2 1 1 1 1 3 2 1 1 ...
## $ rs1532268 : Factor w/ 3 levels "AA","AG","GG": 1 3 2 3 2 2 3 3 2 2 ...
## $ rs7703033 : Factor w/ 3 levels "AA","AG","GG": 3 2 3 1 2 2 3 2 2 2 ...
## $ rs6555501 : Factor w/ 3 levels "CC","CT","TT": 3 2 3 1 2 2 3 2 2 2 ...
## $ rs162035 : Factor w/ 3 levels "AA","AG","GG": 3 3 3 2 3 3 3 3 3 3 ...
## $ rs3815743 : Factor w/ 3 levels "AA","AG","GG": 1 1 2 1 1 1 1 1 1 1 ...
## $ rs162040 : Factor w/ 3 levels "AA","AC","CC": 1 2 1 1 1 1 3 1 1 1 ...
## $ rs10475399: Factor w/ 3 levels "AA","AG","GG": 3 2 2 2 3 3 1 2 3 3 ...
## $ rs17585355: Factor w/ 3 levels "AA","AC","CC": 1 1 1 1 1 1 1 1 1 2 ...
## $ rs2228604 : Factor w/ 3 levels "AA","AC","CC": 3 3 2 2 1 2 3 2 2 2 ...
## [list output truncated]
Before doing any analysis, we should make sure the contrasts are performed with respect to the controls, not the Alzheimer cases.
All$Diag <- relevel(All$Diag, ref = "Control")
Adding binary variables to the merged dataset according to if the data belong to a specific region, being 0 not belonging and 1 belonging.
# One 0/1 indicator column per centre; column names are title-cased
# versions of the (upper-case) centre factor levels.
centres <- stringr::str_to_title(levels(All$Centre))
All[centres] <- 0
for (centre in centres) {
  All[[centre]][All$Centre == toupper(centre)] <- 1
}
Then we divide the dataset into regions according to the centres.
All$Region <- ifelse(All$Centre == "MADRID" | All$Centre == "OVIEDO" | All$Centre == "SANTANDER", "Spain", "N.Eur")
Quality control
We will perform a quality control of the samples before proceeding to the analysis step. An example of typical quality control procedures usually performed can be found here
Formatting dataset
Once we’ve merged both datasets we can proceed with quality control procedures.
We’ve previously prepared a map file with the snps under study. It was generated with the GRCh38 genome assembly version.
# PLINK .map file (GRCh38): chromosome, SNP id, genetic distance, bp position.
map <- read.delim(file.path(input, "raw_data.map"), header = FALSE)
colnames(map) <- c("Chr", "SNP", "GD", "BPP")
The file has the following structure:
map
|
Chr
|
SNP
|
GD
|
BPP
|
|
1
|
rs10745354
|
0
|
109389286
|
|
1
|
rs1149175
|
0
|
109379755
|
|
1
|
rs17585355
|
0
|
109315193
|
|
1
|
rs17646665
|
0
|
109369429
|
|
1
|
rs1800896
|
0
|
206773552
|
|
1
|
rs2228604
|
0
|
109342153
|
We can see that the snps ids are exactly the same found in our dataset so we already got all the information we need for the map file.
# Sanity check: the map file and our SNP objects cover exactly the same ids
# (both set differences should be empty).
setdiff(map$SNP, names(list_of_objects))
## character(0)
setdiff(names(list_of_objects), map$SNP)
## character(0)
Now, we take care of the ped file. First, we create an empty ped file structure that we will fill up with the data.
# Empty .ped skeleton: family id, individual id, paternal id, maternal id,
# sex and phenotype — one row per merged individual.
ped <- data.frame(matrix(ncol = 6, nrow = nrow(All)))
colnames(ped) <- c("FID", "IID", "PID", "MID", "Sex", "P")
Then we introduce the columns values for each of the defined variables
# Fill in the pedigree columns following PLINK's .ped conventions. The
# per-element sapply() if/else chains are replaced by vectorized ifelse(),
# which produces the same values without looping over rows.
ped$FID <- All$ID #It is the same plink does when the flag --no-fid is used
ped$IID <- All$ID
ped$PID <- 0 # parental ids unknown
ped$MID <- 0
# Sex: 1 = male, 2 = female, 0 = unknown/missing.
ped$Sex <- ifelse(is.na(All$Sex), 0,
                  ifelse(All$Sex == "Male", 1,
                         ifelse(All$Sex == "Female", 2, 0)))
# Phenotype: 1 = control, 2 = case (AD), -9 = missing.
ped$P <- ifelse(is.na(All$Diag), -9,
                ifelse(All$Diag == "Control", 1,
                       ifelse(All$Diag == "AD", 2, -9)))
If we take a look we can see that the variables have been coded correctly
head(All[,c("ID", "Sex", "Diag")])
## ID Sex Diag
## 1 10 Male Control
## 2 100 Female Control
## 3 1000 Male AD
## 4 10000 Female Control
## 5 10004 Female Control
## 6 10005 Male Control
head(ped[,c("IID", "Sex", "P")])
## IID Sex P
## 1 10 1 1
## 2 100 2 1
## 3 1000 1 2
## 4 10000 2 1
## 5 10004 2 1
## 6 10005 1 1
Now, then the final step would be to add the genotypes, and as the version of plink we are using (PLINK v1.90) parses the genotypes, we don’t need to split the columns we have into allele columns and we can just bind it directly to our ped file.
ped <- cbind(ped, All[,map$SNP])
But we have yet some missing values remaining in our dataset.
colSums(is.na(ped))
## FID IID PID MID Sex P rs10745354
## 0 0 0 0 0 0 468
## rs1149175 rs17585355 rs17646665 rs1800896 rs2228604 rs7536292 rs13022344
## 455 459 447 460 469 493 471
## rs744373 rs11096990 rs4975002 rs4975009 rs6851075 rs10475399 rs13181011
## 493 796 491 467 488 485 452
## rs1532268 rs162035 rs162040 rs1801394 rs326120 rs3815743 rs3931914
## 483 457 477 471 486 494 468
## rs6555501 rs7703033 rs11754661 rs2069442 rs1187323 rs1545285 rs1611115
## 485 471 479 493 478 484 457
## rs1611131 rs1800977 rs2422493 rs11030102 rs11030104 rs11030119 rs12288512
## 841 466 483 492 477 491 464
## rs2373115 rs4945261 rs6265 rs1799986 rs731236 rs7975232 rs4900442
## 462 468 456 502 497 478 458
## rs7157609 rs12325817 rs16961153 rs1979277 rs242557 rs2471738 rs643333
## 509 506 461 480 330 347 477
## rs669340 rs7946 rs9901160 rs2236707 rs4800488 rs1052533 rs2695121
## 455 488 459 471 463 473 482
## rs3865444 rs429358 rs597668 rs7412 rs1799990 rs1051266 rs1131603
## 468 580 485 469 475 472 452
## rs16988828 rs1801198 rs2071746 rs5749131 rs5749135 rs5753231 rs5997711
## 455 472 457 490 457 482 475
## rs7289553 rs757874 rs9606756 rs9621049
## 497 476 464 466
And we convert the missing values to 00, as plink expects.
# PLINK encodes a missing genotype as "00"; coerce the SNP columns to
# character first so the replacement does not clash with factor levels,
# then verify no NA remains.
snp_cols <- map$SNP
ped[snp_cols] <- lapply(ped[snp_cols], as.character)
ped[is.na(ped)] <- "00"
sum(is.na(ped))
## [1] 0
Then we can save the ped file in order to use it as input for plink.
write.table(ped, file = file.path(input, "raw_data.ped"), quote = F, sep = "\t", row.names = F, col.names = F)
Checking plink version
We will use the command line plink program. To get the version, you can just type:
plink --version
# PLINK v1.90b6.21 64-bit (19 Oct 2020)
## PLINK v1.90b6.21 64-bit (19 Oct 2020)
Making binary files
The first step is to create a folder where to store the output that we will obtain in all subsequent steps using the program.
mkdir -p "$PWD"/data/Output/plink/0-Data
run_shell('mkdir %CD%/data/Output/plink/0-Data')
The next step will be to convert the files obtained into binary files to speed up the computation.
plink --file "$PWD"/data/Input/raw_data --make-bed --out "$PWD"/data/Output/plink/0-Data/binary_data
run_shell('plink --file %CD%/data/Input/raw_data --make-bed --out %CD%/data/Output/plink/0-Data/binary_data', ignore.stdout = !verbose)
Removing missingness
We first filter SNPs and individuals based on a relaxed threshold (0.2; >20%), as this will filter out SNPs and individuals with very high levels of missingness. Then a filter with a more stringent threshold will be applied (0.02). We do first SNP filtering before individual filtering.
mkdir -p "$PWD"/data/Output/plink/1-QC/1-Missingness
plink --bfile "$PWD"/data/Output/plink/0-Data/binary_data --missing --out "$PWD"/data/Output/plink/1-QC/1-Missingness/missing
plink --bfile "$PWD"/data/Output/plink/0-Data/binary_data --geno 0.2 --make-bed --out "$PWD"/data/Output/plink/1-QC/1-Missingness/missing_1
plink --bfile "$PWD"/data/Output/plink/1-QC/1-Missingness/missing_1 --mind 0.2 --make-bed --out "$PWD"/data/Output/plink/1-QC/1-Missingness/missing_2
plink --bfile "$PWD"/data/Output/plink/1-QC/1-Missingness/missing_2 --geno 0.02 --make-bed --out "$PWD"/data/Output/plink/1-QC/1-Missingness/missing_3
plink --bfile "$PWD"/data/Output/plink/1-QC/1-Missingness/missing_3 --mind 0.02 --make-bed --out "$PWD"/data/Output/plink/1-QC/1-Missingness/missing_4
run_shell('mkdir %CD%/data/Output/plink/1-QC/1-Missingness')
# Per-SNP and per-individual missingness report.
run_shell('plink --bfile %CD%/data/Output/plink/0-Data/binary_data --missing --out %CD%/data/Output/plink/1-QC/1-Missingness/missing', ignore.stdout = !verbose)
# Relaxed pass: drop SNPs (--geno) then individuals (--mind) with >20% missing.
run_shell('plink --bfile %CD%/data/Output/plink/0-Data/binary_data --geno 0.2 --make-bed --out %CD%/data/Output/plink/1-QC/1-Missingness/missing_1', ignore.stdout = !verbose)
run_shell('plink --bfile %CD%/data/Output/plink/1-QC/1-Missingness/missing_1 --mind 0.2 --make-bed --out %CD%/data/Output/plink/1-QC/1-Missingness/missing_2', ignore.stdout = !verbose)
# Stringent pass: 2% thresholds, again SNPs before individuals.
run_shell('plink --bfile %CD%/data/Output/plink/1-QC/1-Missingness/missing_2 --geno 0.02 --make-bed --out %CD%/data/Output/plink/1-QC/1-Missingness/missing_3', ignore.stdout = !verbose)
run_shell('plink --bfile %CD%/data/Output/plink/1-QC/1-Missingness/missing_3 --mind 0.02 --make-bed --out %CD%/data/Output/plink/1-QC/1-Missingness/missing_4', ignore.stdout = !verbose)
Removing SNPs with lower MAF than threshold
SNPs with a low MAF are rare, therefore power is lacking for detecting SNP‐phenotype associations. These SNPs are also more prone to genotyping errors. The MAF threshold depends on sample size, larger samples can use lower MAF thresholds. As the sample size in this study is moderate (N = 8716), we use the 0.05 as MAF threshold.
mkdir "$PWD"/data/Output/plink/1-QC/2-MAF
plink --bfile "$PWD"/data/Output/plink/1-QC/1-Missingness/missing_4 --maf 0.05 --make-bed --out "$PWD"/data/Output/plink/1-QC/2-MAF/minor_allele
run_shell('mkdir %CD%/data/Output/plink/1-QC/2-MAF')
run_shell('plink --bfile %CD%/data/Output/plink/1-QC/1-Missingness/missing_4 --maf 0.05 --make-bed --out %CD%/data/Output/plink/1-QC/2-MAF/minor_allele', ignore.stdout = !verbose)
Looking at deviations from Hardy-Weinberg equilibrium
Deviations from HWE is a common indicator of genotyping error, but it may also indicate evolutionary selection. We will use a threshold of 10^-6 for controls and 10^-10 for cases.
mkdir "$PWD"/data/Output/plink/1-QC/3-HWE
plink --bfile "$PWD"/data/Output/plink/1-QC/2-MAF/minor_allele --hwe 1e-6 --make-bed --out "$PWD"/data/Output/plink/1-QC/3-HWE/hardy_weinberg
plink --bfile "$PWD"/data/Output/plink/1-QC/3-HWE/hardy_weinberg --hwe 1e-10 include-nonctrl --make-bed --out "$PWD"/data/Output/plink/1-QC/3-HWE/hardy_weinberg_1
run_shell('mkdir %CD%/data/Output/plink/1-QC/3-HWE')
run_shell('plink --bfile %CD%/data/Output/plink/1-QC/2-MAF/minor_allele --hwe 1e-6 --make-bed --out %CD%/data/Output/plink/1-QC/3-HWE/hardy_weinberg', ignore.stdout = !verbose)
run_shell('plink --bfile %CD%/data/Output/plink/1-QC/3-HWE/hardy_weinberg --hwe 1e-10 include-nonctrl --make-bed --out %CD%/data/Output/plink/1-QC/3-HWE/hardy_weinberg_1', ignore.stdout = !verbose)
Removing individuals with a heterozygosity rate deviating more than 3SD from the mean
Deviations can indicate sample contamination or inbreeding. Heterozygosity checks are performed on a set of SNPs that are not highly correlated. To generate this list of non-(highly)correlated SNPs, we exclude high inversion regions.
mkdir "$PWD"/data/Output/plink/1-QC/4-Het
plink --bfile "$PWD"/data/Output/plink/1-QC/3-HWE/hardy_weinberg_1 --exclude "$PWD"/data/Input/inversion.txt --range --indep-pairwise 50 5 0.2 --out "$PWD"/data/Output/plink/1-QC/4-Het/indepSNP
run_shell('mkdir %CD%/data/Output/plink/1-QC/4-Het')
run_shell('plink --bfile %CD%/data/Output/plink/1-QC/3-HWE/hardy_weinberg_1 --exclude %CD%/data/Input/inversion.txt --range --indep-pairwise 50 5 0.2 --out %CD%/data/Output/plink/1-QC/4-Het/indepSNP', ignore.stdout = !verbose)
Once we’ve done that we can compute the heterozygosity rates
plink --bfile "$PWD"/data/Output/plink/1-QC/3-HWE/hardy_weinberg_1 --extract "$PWD"/data/Output/plink/1-QC/4-Het/indepSNP.prune.in --het --out "$PWD"/data/Output/plink/1-QC/4-Het/Het_check
run_shell('plink --bfile %CD%/data/Output/plink/1-QC/3-HWE/hardy_weinberg_1 --extract %CD%/data/Output/plink/1-QC/4-Het/indepSNP.prune.in --het --out %CD%/data/Output/plink/1-QC/4-Het/Het_check', ignore.stdout = !verbose)
From this file we can generate a list of individuals who deviate more than 3 standard deviations from the heterozygosity rate mean.
# All credit to Andries Marees at https://github.com/MareesAT/GWA_tutorial
# Flag individuals whose heterozygosity rate deviates more than 3 SD from
# the mean. Mean and SD are computed once and reused (the original
# recomputed them four times); FALSE is spelled out instead of `F`.
het <- read.table(file.path(output, "plink", "1-QC", "4-Het", "Het_check.het"), head = TRUE)
het$HET_RATE <- (het$"N.NM." - het$"O.HOM.")/het$"N.NM."
het_mean <- mean(het$HET_RATE)
het_sd <- sd(het$HET_RATE)
# abs(x - mean) > 3*sd is equivalent to (x < mean - 3*sd) | (x > mean + 3*sd)
het_fail <- subset(het, abs(het$HET_RATE - het_mean) > 3 * het_sd)
# Distance from the mean in SD units, for reporting.
het_fail$HET_DST <- (het_fail$HET_RATE - het_mean) / het_sd
write.table(het_fail[,1:2], file.path(output, "plink", "1-QC", "4-Het", "fail-het-qc.txt"), row.names = FALSE, quote = FALSE)
Now, we proceed to remove those heterozygosity rate outliers
plink --bfile "$PWD"/data/Output/plink/1-QC/3-HWE/hardy_weinberg_1 --remove "$PWD"/data/Output/plink/1-QC/4-Het/fail-het-qc.txt --make-bed --out "$PWD"/data/Output/plink/1-QC/4-Het/heterozigosity
run_shell('plink --bfile %CD%/data/Output/plink/1-QC/3-HWE/hardy_weinberg_1 --remove %CD%/data/Output/plink/1-QC/4-Het/fail-het-qc.txt --make-bed --out %CD%/data/Output/plink/1-QC/4-Het/heterozigosity', ignore.stdout = !verbose)
Population structure analysis
We can also check for population substructure in our dataset, for instance by performing a PCA.
This additional analysis is only actually performed on Unix computers; on Windows, only the commands used and an example of the output obtained are provided. It also isn’t run by default, because of the bottleneck this additional analysis causes. The results obtained are shown as recorded in a .png file of the PCA plot. You can control whether the analysis should be performed by setting the run_PCA variable to TRUE or FALSE.
run_PCA <- FALSE
For that, we downloaded a 1000 Genomes file of 629 individuals genotyped at a genome-wide set of SNPs. This file was downloaded from: ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/release/20100804/ALL.2of4intersection.20100804.genotypes.vcf.gz.
After downloading it the file was converted to plink files, and performed QC as described in the Population Stratification tutorial in https://github.com/MareesAT/GWA_tutorial.
Removing unshared SNPs
Only the SNPs present in both datasets were kept, to be able to make the comparison. Due to the size of the 1000 Genomes files, we have already extracted the relevant SNPs from the 1000 Genomes plink files. To do so for the EP dataset we might do:
# Create the population-structure output folder
mkdir -p "$PWD"/data/Output/plink/2-POP/
# List the SNP IDs (column 2 of the .bim) present in the 1000 Genomes data
awk '{print$2}' "$PWD"/data/Input/1KG_PCA/1KG.bim > "$PWD"/data/Output/plink/2-POP/1KG_snps.txt
# Keep only those shared SNPs in the EP dataset
plink --bfile "$PWD"/data/Output/plink/1-QC/4-Het/heterozigosity --extract "$PWD"/data/Output/plink/2-POP/1KG_snps.txt --make-bed --recode --out "$PWD"/data/Output/plink/2-POP/EP
# Count how many SNPs are shared
cat "$PWD"/data/Output/plink/2-POP/1KG_snps.txt | wc -l
Update build
Then, the next step would be to check that both datasets have the same build and if not update the older 1KG file accordingly.
# Extract SNP ID and base-pair position from the EP .map file
awk '{print$2,$4}' "$PWD"/data/Output/plink/2-POP/EP.map > "$PWD"/data/Output/plink/2-POP/buildEP.txt
# Update the 1KG positions so both datasets share the EP genome build
plink --bfile "$PWD"/data/Input/1KG_PCA/1KG --update-map "$PWD"/data/Output/plink/2-POP/buildEP.txt --make-bed --out "$PWD"/data/Output/plink/2-POP/1KG_PCA
Prepare merging
In order to merge both files, first we have to ascertain that both files refer to the same reference and alternative alleles.
# Use the 1KG alleles (column 5 of the .bim) as the reference-allele list
awk '{print$2,$5}' "$PWD"/data/Output/plink/2-POP/1KG_PCA.bim > "$PWD"/data/Output/plink/2-POP/1KG_ref-list.txt
# Re-code EP so its reference alleles match the 1KG ones
plink --bfile "$PWD"/data/Output/plink/2-POP/EP --reference-allele "$PWD"/data/Output/plink/2-POP/1KG_ref-list.txt --make-bed --out "$PWD"/data/Output/plink/2-POP/EP_adj
# Dump SNP ID plus both alleles from each dataset for comparison
awk '{print$2,$5,$6}' "$PWD"/data/Output/plink/2-POP/EP_adj.bim > "$PWD"/data/Output/plink/2-POP/EP_adj_tmp
awk '{print$2,$5,$6}' "$PWD"/data/Output/plink/2-POP/1KG_PCA.bim > "$PWD"/data/Output/plink/2-POP/1KG_PCA_tmp
# uniq -u keeps lines appearing only once, i.e. SNPs whose alleles differ between datasets
sort "$PWD"/data/Output/plink/2-POP/1KG_PCA_tmp "$PWD"/data/Output/plink/2-POP/EP_adj_tmp | uniq -u > "$PWD"/data/Output/plink/2-POP/all_differences.txt
cat "$PWD"/data/Output/plink/2-POP/all_differences.txt
Those differences might be just caused by looking at different strands so we will flip those bases and see if the error persists.
# Take the SNP IDs of every mismatch and flip their strand in the EP data
awk '{print$1}' "$PWD"/data/Output/plink/2-POP/all_differences.txt | sort -u > "$PWD"/data/Output/plink/2-POP/flip_list.txt
plink --bfile "$PWD"/data/Output/plink/2-POP/EP_adj --flip "$PWD"/data/Output/plink/2-POP/flip_list.txt --reference-allele "$PWD"/data/Output/plink/2-POP/1KG_ref-list.txt --make-bed --out "$PWD"/data/Output/plink/2-POP/EP_adj_2
## Now check if they are still problematic
awk '{print$2,$5,$6}' "$PWD"/data/Output/plink/2-POP/EP_adj_2.bim > "$PWD"/data/Output/plink/2-POP/EP_adj_2_tmp
# Any line still unique after flipping is a genuine allele mismatch
sort "$PWD"/data/Output/plink/2-POP/1KG_PCA_tmp "$PWD"/data/Output/plink/2-POP/EP_adj_2_tmp | uniq -u > "$PWD"/data/Output/plink/2-POP/still_differences.txt
cat "$PWD"/data/Output/plink/2-POP/still_differences.txt
Merge
And we can see that there are no remaining differences anymore so we shouldn’t remove any SNP and we can proceed to merge both files.
# Merge the adjusted EP data with the 1000 Genomes data into one fileset
plink --bfile "$PWD"/data/Output/plink/2-POP/EP_adj_2 --bmerge "$PWD"/data/Output/plink/2-POP/1KG_PCA.bed "$PWD"/data/Output/plink/2-POP/1KG_PCA.bim "$PWD"/data/Output/plink/2-POP/1KG_PCA.fam --allow-no-sex --make-bed --out "$PWD"/data/Output/plink/2-POP/EP_1KG_PCA
Labeling the superpopulations
Before doing the PCA we can label the data according to different human superpopulations (AFR, EUR etc) in order to be able to keep track of them when we plot them graphically. For that we get a file with population information from the 1000 genomes and transform the subpopulation codes in our files in superpopulation ones.
# Note: for macOS/BSD users use GNU sed (gsed) instead of default sed; Linux users may use sed directly instead of gsed
# Download the 1000 Genomes panel file (sample -> population mapping)
wget -P "$PWD"/data/Output/plink/2-POP/ ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20100804/20100804.ALL.panel
## rename populations
# Duplicate the sample ID as family ID: FID IID POP
awk '{print$1,$1,$2}' "$PWD"/data/Output/plink/2-POP/20100804.ALL.panel > "$PWD"/data/Output/plink/2-POP/pop_1kG.txt
# Collapse every subpopulation code into its superpopulation (AFR, AMR,
# ASN, EUR) in a single gsed pass instead of 13 chained temporary files;
# the substitutions act on distinct codes, so order does not matter
gsed -e 's/JPT/ASN/g' -e 's/ASW/AFR/g' -e 's/CEU/EUR/g' -e 's/CHB/ASN/g' \
  -e 's/CHD/ASN/g' -e 's/YRI/AFR/g' -e 's/LWK/AFR/g' -e 's/TSI/EUR/g' \
  -e 's/MXL/AMR/g' -e 's/GBR/EUR/g' -e 's/FIN/EUR/g' -e 's/CHS/ASN/g' \
  -e 's/PUR/AMR/g' \
  "$PWD"/data/Output/plink/2-POP/pop_1kG.txt > "$PWD"/data/Output/plink/2-POP/pop_1kG14.txt
# Create our own population file, labelling every EP individual as "EP"
awk '{print$1,$2,"EP"}' "$PWD"/data/Output/plink/2-POP/EP_adj_2.fam > "$PWD"/data/Output/plink/2-POP/pop_EP.txt
# Concatenate population files and prepend the header line.
cat "$PWD"/data/Output/plink/2-POP/pop_1kG14.txt "$PWD"/data/Output/plink/2-POP/pop_EP.txt | gsed -e '1i\FID IID SUPERPOP' > "$PWD"/data/Output/plink/2-POP/popfile.txt
# remove temporary files from above (popfile.txt does not match pop_*.txt)
rm -f "$PWD"/data/Output/plink/2-POP/pop_*.txt
PCA
With the data merged we can perform PCA on plink.
# Run the PCA on the merged EP + 1000 Genomes dataset
plink --bfile "$PWD"/data/Output/plink/2-POP/EP_1KG_PCA --pca --out "$PWD"/data/Output/plink/2-POP/PCA_results
We will plot the results using a small Python script (all credit to Xavier Farré)
# engine.opts='-l' loads files that are normally loaded when you start the shell like for instance ~/.bash_profile. In this case, python 3 is aliased to python, instead of using python 2.7.
# Confirm the python alias points to Python 3
python --version
# Plot the PCA results coloured by superpopulation
python "$PWD"/data/Input/1KG_PCA/plot_PCA.py
Defining inheritance models
The next step is to obtain all possible SNP inheritance model combinations and add the new columns to the dataset.
# Build one column name per SNP and inheritance model:
# "<snp>r" (recessive), "<snp>a" (additive), "<snp>d" (dominant)
recessive <- paste0(vector_of_snps, "r")
additive <- paste0(vector_of_snps, "a")
dominant <- paste0(vector_of_snps, "d")
inheritance <- c(recessive, additive, dominant)
# Add the (for now empty) model columns to the dataset
All[inheritance] <- NA
Then we define the variables (columns) order in the dataset.
# Identifiers and covariates first, then centre dummies, raw genotypes,
# and finally the inheritance-model columns
col_order <- c(
  "ID", "Diag", "Sex", "Age_to_use", "Age82", "Region", "E4status", "Centre",
  centres, vector_of_snps, inheritance
)
All <- All[, col_order]
Finally we code the inheritance with numbers depending on the genotype values for each snp. (0 reference, 1 risk allele effect, 2 two risk alleles effect {just for the additive model})
# Fill the model columns: 0 = reference, 1 = one risk-allele effect,
# 2 = two risk alleles (only used by the additive model)
All <- generate_inheritances(snps = list_of_objects, data = All)
Diagnose possible problems and perform data conversions
After a quick look at the data we decide to codify an unexplained level in the E4status variable (coming from Rotterdam dataset) into missing data.
# Recode the unexplained level 2 of E4status (from the Rotterdam dataset)
# as missing data
All$E4status[All$E4status == 2] <- NA
Then we convert all categorical variables into the factor class.
# Convert every column except 1 (ID) and 4 (Age_to_use, per col_order) to
# factor; the negative indices select "all but" those columns
factors <- -c(1,4)
All[,factors] <- lapply(All[,factors], factor)
The next step to be performed is to do some further data exploration by creating contingency tables of age, sex and E4status by diagnosis and centre.
# Cross-tabulate every pair and every triple of the selected variables.
variables <- c("Age82", "Sex", "E4status", "Diag", "Centre", "Region")
list_of_tables <- vector(mode = "list", length = 2)
for (i in 2:3) { # two- and three-way cross tabulations
  # All combinations of the variables, taken i at a time
  combinations <- combn(variables, i, simplify = FALSE)
  table_names <- character(length = length(combinations))
  for (n in seq_along(combinations)) {
    combination <- combinations[[n]]
    # Build the xtabs formula, e.g. "~All$Age82 + All$Sex"
    formula <- paste0("~", paste(paste0("All$", combination), collapse = " + "))
    # Store each table under the name variable1_variable2[_variable3]
    name <- paste(combination, collapse = "_")
    assign(name, xtabs(formula))
    table_names[n] <- name
  }
  # Keep the table names so we can retrieve them later
  list_of_tables[[i - 1]] <- table_names
}
names(list_of_tables) <- c("Two variables", "Three variables")
We can take a quick look at an overview of the contingency tables created by printing the list_of_tables object.
list_of_tables
## $`Two variables`
## [1] "Age82_Sex" "Age82_E4status" "Age82_Diag" "Age82_Centre"
## [5] "Age82_Region" "Sex_E4status" "Sex_Diag" "Sex_Centre"
## [9] "Sex_Region" "E4status_Diag" "E4status_Centre" "E4status_Region"
## [13] "Diag_Centre" "Diag_Region" "Centre_Region"
##
## $`Three variables`
## [1] "Age82_Sex_E4status" "Age82_Sex_Diag" "Age82_Sex_Centre"
## [4] "Age82_Sex_Region" "Age82_E4status_Diag" "Age82_E4status_Centre"
## [7] "Age82_E4status_Region" "Age82_Diag_Centre" "Age82_Diag_Region"
## [10] "Age82_Centre_Region" "Sex_E4status_Diag" "Sex_E4status_Centre"
## [13] "Sex_E4status_Region" "Sex_Diag_Centre" "Sex_Diag_Region"
## [16] "Sex_Centre_Region" "E4status_Diag_Centre" "E4status_Diag_Region"
## [19] "E4status_Centre_Region" "Diag_Centre_Region"
Another step we can perform for the sake of error checking is creating summary tables of mean, min, max age by diagnosis and centre.
First, we give variables shorter names for ease of use.
# Shorter aliases for the columns used below ([[ ]] avoids the partial
# matching that $ would allow)
ages <- All[["Age_to_use"]]
diagnostic <- All[["Diag"]]
centres <- All[["Centre"]]
regions <- All[["Region"]]
Then we create the summary tables.
# Each summary is defined by an optional diagnosis filter plus a grouping
# vector; createSummary() is then applied to the matching ages.
summary_specs <- list(
  AgeAll           = list(filter = NULL,      grouping = diagnostic), # age by diagnosis
  AgeAD            = list(filter = "AD",      grouping = centres),    # AD patients by centre
  AgeControl       = list(filter = "Control", grouping = centres),    # controls by centre
  AgeADRegion      = list(filter = "AD",      grouping = regions),    # AD patients by region
  AgeControlRegion = list(filter = "Control", grouping = regions)     # controls by region
)
for (table_name in names(summary_specs)) {
  spec <- summary_specs[[table_name]]
  if (is.null(spec$filter)) {
    variable <- ages
    group <- spec$grouping
  } else {
    variable <- ages[diagnostic == spec$filter]
    group <- spec$grouping[diagnostic == spec$filter]
  }
  assign(table_name, createSummary(variable, group))
}
# Names of the summary tables created above
summary_tables <- c("AgeAll", "AgeAD", "AgeControl", "AgeADRegion", "AgeControlRegion")
# Print each table preceded by its name. The loop variable is called `tbl`
# (the original used `summary`, which shadowed base::summary()).
for (tbl in summary_tables) {
  print(tbl)
  print(get(tbl))
  cat("\n")
}
## [1] "AgeAll"
## [1] ""
## [2] " Descriptive statistics by group "
## [3] "group: Control"
## [4] " vars n mean sd median trimmed mad min max range skew kurtosis se"
## [5] "X1 1 6057 81.92 7.75 82 82.09 7.41 60 106 46 -0.19 0.1 0.1"
## [6] "------------------------------------------------------------ "
## [7] "group: AD"
## [8] " vars n mean sd median trimmed mad min max range skew kurtosis se"
## [9] "X1 1 2659 81 7.95 81 81.24 8.9 60 109 49 -0.24 -0.21 0.15"
##
## [1] "AgeAD"
## [1] ""
## [2] " Descriptive statistics by group "
## [3] "group: BRISTOL"
## [4] " vars n mean sd median trimmed mad min max range skew kurtosis se"
## [5] "X1 1 210 80.33 8.59 81 80.77 8.9 60 99 39 -0.4 -0.3 0.59"
## [6] "------------------------------------------------------------ "
## [7] "group: MADRID"
## [8] " vars n mean sd median trimmed mad min max range skew kurtosis se"
## [9] "X1 1 232 75.09 9.8 74 74.06 8.9 60 109 49 0.96 0.74 0.64"
## [10] "------------------------------------------------------------ "
## [11] "group: NOTTINGHAM"
## [12] " vars n mean sd median trimmed mad min max range skew kurtosis se"
## [13] "X1 1 253 82.74 8.3 84 83.11 7.41 61 101 40 -0.41 -0.15 0.52"
## [14] "------------------------------------------------------------ "
## [15] "group: OPTIMA"
## [16] " vars n mean sd median trimmed mad min max range skew kurtosis se"
## [17] "X1 1 268 79.29 8.04 80 79.59 8.9 60 100 40 -0.26 -0.5 0.49"
## [18] "------------------------------------------------------------ "
## [19] "group: OVIEDO"
## [20] " vars n mean sd median trimmed mad min max range skew kurtosis se"
## [21] "X1 1 256 76.85 5.76 77 76.89 5.93 63 94 31 -0.05 -0.17 0.36"
## [22] "------------------------------------------------------------ "
## [23] "group: SANTANDER"
## [24] " vars n mean sd median trimmed mad min max range skew kurtosis se"
## [25] "X1 1 226 77.35 6.59 78 77.58 5.93 60 98 38 -0.24 0.11 0.44"
## [26] "------------------------------------------------------------ "
## [27] "group: ROTTERDAM"
## [28] " vars n mean sd median trimmed mad min max range skew kurtosis se"
## [29] "X1 1 1214 83.81 6.48 84 84.03 5.93 61 101 40 -0.34 -0.03 0.19"
##
## [1] "AgeControl"
## [1] ""
## [2] " Descriptive statistics by group "
## [3] "group: BRISTOL"
## [4] " vars n mean sd median trimmed mad min max range skew kurtosis se"
## [5] "X1 1 88 81.32 8.21 81.5 81.58 9.64 62 96 34 -0.22 -0.68 0.88"
## [6] "------------------------------------------------------------ "
## [7] "group: MADRID"
## [8] " vars n mean sd median trimmed mad min max range skew kurtosis se"
## [9] "X1 1 278 75.94 10.57 74 75.11 11.12 60 104 44 0.64 -0.24 0.63"
## [10] "------------------------------------------------------------ "
## [11] "group: NOTTINGHAM"
## [12] " vars n mean sd median trimmed mad min max range skew kurtosis se"
## [13] "X1 1 148 78.43 10.13 78.5 78.42 11.12 60 104 44 -0.04 -0.84 0.83"
## [14] "------------------------------------------------------------ "
## [15] "group: OPTIMA"
## [16] " vars n mean sd median trimmed mad min max range skew kurtosis se"
## [17] "X1 1 208 79.76 7.42 80 79.9 7.41 62 100 38 -0.1 -0.18 0.51"
## [18] "------------------------------------------------------------ "
## [19] "group: OVIEDO"
## [20] " vars n mean sd median trimmed mad min max range skew kurtosis se"
## [21] "X1 1 87 73.24 4.57 74 73.46 4.45 62 87 25 -0.28 0.26 0.49"
## [22] "------------------------------------------------------------ "
## [23] "group: SANTANDER"
## [24] " vars n mean sd median trimmed mad min max range skew kurtosis se"
## [25] "X1 1 106 80.95 7.56 81.5 81.03 8.15 62 99 37 -0.11 -0.3 0.73"
## [26] "------------------------------------------------------------ "
## [27] "group: ROTTERDAM"
## [28] " vars n mean sd median trimmed mad min max range skew kurtosis se"
## [29] "X1 1 5142 82.6 7.26 83 82.72 7.41 60 106 46 -0.14 0.22 0.1"
##
## [1] "AgeADRegion"
## [1] ""
## [2] " Descriptive statistics by group "
## [3] "group: N.Eur"
## [4] " vars n mean sd median trimmed mad min max range skew kurtosis se"
## [5] "X1 1 1945 82.67 7.41 83 83.05 7.41 60 101 41 -0.48 0.11 0.17"
## [6] "------------------------------------------------------------ "
## [7] "group: Spain"
## [8] " vars n mean sd median trimmed mad min max range skew kurtosis se"
## [9] "X1 1 714 76.44 7.59 76 76.25 7.41 60 109 49 0.44 0.9 0.28"
##
## [1] "AgeControlRegion"
## [1] ""
## [2] " Descriptive statistics by group "
## [3] "group: N.Eur"
## [4] " vars n mean sd median trimmed mad min max range skew kurtosis se"
## [5] "X1 1 5586 82.37 7.42 82 82.52 7.41 60 106 46 -0.18 0.2 0.1"
## [6] "------------------------------------------------------------ "
## [7] "group: Spain"
## [8] " vars n mean sd median trimmed mad min max range skew kurtosis se"
## [9] "X1 1 471 76.57 9.44 75 76.03 8.9 60 104 44 0.54 -0.06 0.43"
Creating data subsets
We create a data subset for each of the regions.
# Create one data subset per region, stored under the region's own name.
# which() drops rows with a missing Region (matching subset()'s behaviour),
# and the local variable is no longer called `subset`, which shadowed
# base::subset().
for (region in levels(All$Region)) {
  region_subset <- All[which(All$Region == region), ]
  assign(region, region_subset)
}
And also for each individual center.
# One subset per individual centre, stored under the title-cased centre
# name (e.g. "Oviedo"), so it does not clash with the upper-case levels.
# which() drops rows with a missing Centre (matching subset()'s behaviour);
# the local variable is no longer called `subset` (shadowed base::subset()).
for (centre in levels(All$Centre)) {
  centre_subset <- All[which(All$Centre == centre), ]
  assign(stringr::str_to_title(centre), centre_subset)
}
Now, that we’ve done that it would be interesting to also subset according to each one of the covariates.
# Subset All by each level of each covariate (e.g. "Sex_Male").
variables <- c("Age82", "Sex", "E4status")
for (variable in variables) {
  for (level in levels(All[[variable]])) {
    name <- paste(variable, level, sep = "_")
    # which() drops rows where the covariate is NA; the previous bare
    # logical subscript (All[All[[variable]] == level, ]) kept each NA row
    # as a phantom all-NA row in the subset (E4status does contain NAs).
    assign(name, All[which(All[[variable]] == level), ])
  }
}
And also subset the covariates but by region.
# The same covariate subsets, but within each region (e.g. "Spain_Sex_Male").
for (region in levels(All$Region)) {
  Region <- get(region)
  for (variable in variables) {
    for (level in levels(Region[[variable]])) {
      name <- paste(region, variable, level, sep = "_")
      # which() drops NA covariate rows instead of keeping phantom
      # all-NA rows (as a bare logical subscript would)
      assign(name, Region[which(Region[[variable]] == level), ])
    }
  }
}
And of course, subsetting those same covariates but by center.
# And the covariate subsets within each centre (e.g. "Oviedo_Sex_Male").
centres <- stringr::str_to_title(levels(All$Centre))
for (centre in centres) {
  Centre <- get(centre)
  for (variable in variables) {
    # NOTE(review): levels are taken from All here, while the region loop
    # above takes them from the subset — confirm this asymmetry is intended
    for (level in levels(All[[variable]])) {
      name <- paste(centre, variable, level, sep = "_")
      # which() drops NA covariate rows instead of keeping phantom
      # all-NA rows (as a bare logical subscript would)
      assign(name, Centre[which(Centre[[variable]] == level), ])
    }
  }
}
Analysis
Main effects
First, we select the datasets into which we want to perform the analysis and then all covariates we want to control for.
# Datasets the analysis will run on and the covariates to control for
DATASETS <- c("All", "N.Eur", "Spain")
variables <- c("Sex", "E4status", "Age82", "Centre")
Then we reorder the datasets alphabetically and set Santander as the reference level in the Spanish dataset instead of Madrid, as the former has a higher N, while also making sure the analyses are performed with respect to the Control reference level rather than the Diagnosed one.
# Reorder every dataset's columns alphabetically
for (dataset_name in DATASETS) {
  dataset <- get(dataset_name)
  assign(dataset_name, dataset[, order(names(dataset))])
}
# Santander (larger N) as the reference centre for Spain; the level name is
# used instead of the former positional index (ref = 6), which is robust to
# any change in the factor's level order
Spain$Centre <- relevel(Spain$Centre, ref = "SANTANDER")
# Estimate all effects with respect to the Control group
All$Diag <- relevel(All$Diag, ref = "Control")
Finally we perform the main effects analysis for the selected SNPs.
# Fit the main-effects glms for every snp model in every selected dataset
master_list <- perform_analysis(.mode = "main_effects", .data = DATASETS, snps = list_of_objects, covariates = variables)
This function outputs a list with the results of the glm contained in it (best model of inheritance and glm results). An example of the structure of the list with only the main effects performed can be seen here, consisting in the first snp of the All (N.Eur + Spain) dataset:
## $rs5749131
## $rs5749131$Gene
## [1] "TCN2"
##
## $rs5749131$Best_model
## rs5749131a
## "[1] AIC quasi= 7840.0857"
##
## $rs5749131$Main_effects
## $rs5749131$Main_effects$rs5749131a
## Estimate Std. Error OR lower upper Pr(>|t|)
## (Intercept) 0.662 0.151 1.938 1.441 2.606 1.205765e-05
## SexMale -0.521 0.059 0.594 0.529 0.667 1.486778e-18
## E4statusE4+ 1.124 0.059 3.077 2.742 3.452 6.303636e-80
## Age82>82 0.324 0.060 1.383 1.230 1.555 6.094967e-08
## CentreMADRID -0.977 0.168 0.376 0.271 0.523 6.295820e-09
## CentreNOTTINGHAM -0.401 0.175 0.670 0.475 0.944 2.194844e-02
## CentreOPTIMA -0.659 0.168 0.517 0.372 0.719 8.641606e-05
## CentreOVIEDO 0.348 0.189 1.416 0.978 2.052 6.581899e-02
## CentreSANTANDER -0.116 0.184 0.890 0.621 1.277 5.272912e-01
## CentreROTTERDAM -2.434 0.142 0.088 0.066 0.116 2.672129e-64
## rs5749131a1 -0.123 0.063 0.884 0.781 1.000 5.056580e-02
## rs5749131a2 -0.237 0.083 0.789 0.671 0.928 4.239103e-03
##
## $rs5749131$Main_effects$rs5749131d
## Estimate Std. Error OR lower upper Pr(>|t|)
## (Intercept) 0.660 0.151 1.935 1.439 2.601 1.253322e-05
## SexMale -0.522 0.059 0.594 0.529 0.666 1.349425e-18
## E4statusE4+ 1.124 0.059 3.077 2.742 3.451 4.833744e-80
## Age82>82 0.325 0.060 1.384 1.231 1.556 5.622751e-08
## CentreMADRID -0.973 0.168 0.378 0.272 0.525 7.167167e-09
## CentreNOTTINGHAM -0.399 0.175 0.671 0.476 0.945 2.241794e-02
## CentreOPTIMA -0.656 0.168 0.519 0.374 0.721 9.325110e-05
## CentreOVIEDO 0.349 0.189 1.417 0.978 2.053 6.526792e-02
## CentreSANTANDER -0.114 0.184 0.892 0.622 1.279 5.348744e-01
## CentreROTTERDAM -2.433 0.142 0.088 0.066 0.116 2.710091e-64
## rs5749131d1 -0.154 0.060 0.857 0.763 0.964 9.900838e-03
##
## $rs5749131$Main_effects$rs5749131r
## Estimate Std. Error OR lower upper Pr(>|t|)
## (Intercept) 0.592 0.147 1.807 1.356 2.409 5.525052e-05
## SexMale -0.521 0.059 0.594 0.529 0.667 1.459895e-18
## E4statusE4+ 1.123 0.059 3.073 2.739 3.447 8.260309e-80
## Age82>82 0.324 0.060 1.382 1.229 1.554 6.461236e-08
## CentreMADRID -0.983 0.168 0.374 0.269 0.520 5.101118e-09
## CentreNOTTINGHAM -0.404 0.175 0.667 0.474 0.940 2.081654e-02
## CentreOPTIMA -0.667 0.168 0.513 0.369 0.713 7.047623e-05
## CentreOVIEDO 0.347 0.189 1.414 0.976 2.049 6.696676e-02
## CentreSANTANDER -0.125 0.184 0.883 0.616 1.265 4.966989e-01
## CentreROTTERDAM -2.436 0.142 0.088 0.066 0.116 1.803317e-64
## rs5749131r1 -0.163 0.074 0.850 0.735 0.982 2.726669e-02
We can appreciate that the main effects analysis were also performed for the Spanish and Northern European subsets.
# One result set per dataset
names(master_list)
## [1] "All" "N.Eur" "Spain"
Then, selecting a threshold of 0.05 before applying multiple testing correction, we can obtain the following results for the SNPs under the threshold and the genes to which they belong.
# Nominally significant snp models (p < 0.05, uncorrected) and their genes
print_significant_results(master_list)
##
## rs5749131a2 rs5753231r1 rs1131603d1 rs5997711r1 rs11030102d1 rs11030119d1
## 0.004 0.046 0.049 0.010 0.001 0.032
## rs12288512d1 rs429358a1 rs429358a2 rs7412d1 rs4945261d1 rs2373115d1
## 0.014 0.000 0.000 0.000 0.025 0.030
## rs2695121d1 rs1052533r1 rs744373a1 rs744373a2
## 0.041 0.038 0.042 0.000
##
##
## sig_genes
## TCN2 APOE BDNF BIN1 GAB2 NR1H2
## 4 3 3 2 2 2
# Significant results after multiple testing correction; TRUE is spelled
# out instead of the reassignable shorthand T
print_significant_results(master_list, corrected = TRUE)
##
## rs11030102d1 rs429358a1 rs429358a2 rs7412d1 rs744373a2
## 0.037 0.001 0.000 0.001 0.002
##
##
## sig_genes
## APOE BDNF BIN1
## 3 1 1
Here, we can see that both additive levels for snp rs429358 remain significant, so what we really have are 4 snp models that are found significantly associated with AD, even after multiple testing correction.
The first is the APOE gene, whose E4 variant was found to be the largest known genetic risk factor for late-onset sporadic AD. The rationale is that the APOE-ε4 isoform is not as effective as the others at promoting these reactions, resulting in increased vulnerability to AD in individuals carrying that gene variant.
The BDNF snp (rs11030102) has been linked to decreased BDNF serum levels. In the adult brain, BDNF maintains high expression levels and regulates both excitatory and inhibitory synaptic transmission and activity-dependent plasticity (Miranda et al., 2019)
The bridging integrator 1 (BIN1) gene is the second most important susceptibility gene for late-onset Alzheimer’s disease (LOAD) after apolipoprotein E (APOE) gene. (Hu et al., 2021). BIN1 encodes a Myc-interacting protein but its role in AD is still unclear.
To see a count of the number of genes studied, according to the number of snps we can use the following code:
# Count how many SNPs were studied per gene. vapply() replaces sapply()
# for a type-stable result, and TRUE is spelled out instead of the
# reassignable shorthand T.
genes_studied <- vapply(master_list$All, function(snp) snp$Gene, character(1))
genes_studied <- unname(genes_studied)
genes_studied <- table(genes_studied)
sort(genes_studied, decreasing = TRUE)
## genes_studied
## TCN2 MTRR SORT1 BDNF SHMT1 RFC1 APOE ABCA1 CYP46A1 DBH
## 11 10 6 5 5 4 3 2 2 2
## GAB2 MAPT NPC1 NR1H2 NTRK2 PEMT VDR BIN1 CD33 CDK5
## 2 2 2 2 2 2 2 1 1 1
## HMDX1 HMGCR IL10 LRP1 MTHFD1L PRNP SLC19A1 TRAK2
## 1 1 1 1 1 1 1 1
Here we can see that, of the 75 SNPs studied, all 3 APOE SNPs were found significant, as well as the only SNP studied for BIN1. For BDNF, just one out of five SNPs, rs11030102, was deemed significant according to the main effects.
To measure the strength of the association found we can extract the OR from the main effects results.
# Extract the odds ratio (column 3) of each significant snp-model term
# from the main-effects glm results.
significant_snps <- c("rs11030102d1", "rs429358a1", "rs429358a2", "rs7412d1", "rs744373a2")
OR_results <- sapply(significant_snps, function(snp_model_lvl) {
  snp_model <- sub("[0-2]$", "", snp_model_lvl) # drop dosage level -> model name
  snp <- sub("[a-z]$", "", snp_model)           # drop model letter -> snp name
  glm_results <- master_list$All[[snp]]$Main_effects[[snp_model]]
  index <- grep(rownames(glm_results), pattern = snp_model_lvl)
  glm_results[index, ][3]
})
# sapply() is kept on purpose: it composes the ".OR" suffix into the names,
# which the pattern on the next line relies on.
snps <- sub(names(OR_results), pattern = "[a-z][0-2].OR$", replacement = "")
# Map each snp back to its gene and label a copy of the ORs by gene name
genes <- sapply(snps, function(snp) {list_of_objects[[snp]]$gene})
OR_results2 <- OR_results
names(OR_results2) <- genes
# TRUE spelled out instead of the reassignable shorthand T
sort(OR_results, decreasing = TRUE)
## rs429358a2.OR rs429358a1.OR rs744373a2.OR rs11030102d1.OR rs7412d1.OR
## 8.526 1.777 1.491 1.203 0.681
# Same ORs, labelled by gene instead of snp model; TRUE, not the shorthand T
sort(OR_results2, decreasing = TRUE)
## APOE APOE BIN1 BDNF APOE
## 8.526 1.777 1.491 1.203 0.681
Thus, we can see that one dose of the variant allele under the additive model for SNP rs429358, which corresponds to the APOE gene, increases the odds of having the disease 1.7-fold with respect to not having it, while the full dosage increases them about 8-fold.
On the other hand, the others hold more moderate effects; the dominant model of SNP rs7412, which also corresponds to APOE, multiplies the odds of having the disease by 0.681 — or, what is the same, decreases them by a factor of 1/0.681 ≈ 1.47.
We can generate a dataframe to plot these results:
# Assemble one row per significant snp model with its gene, OR, confidence
# interval and p-value, for plotting.
plot_df <- data.frame(matrix(NA, nrow = length(significant_snps), ncol = 6))
colnames(plot_df) <- c("snp", "gene", "OR", "lower", "upper", "pvalue")
for (i in seq_along(significant_snps)) {
  snp_model_lvl <- significant_snps[i]
  snp_model <- sub("[0-2]$", "", snp_model_lvl) # drop dosage level
  snp <- sub("[a-z]$", "", snp_model)           # drop model letter
  snp_results <- master_list$All[[snp]]
  effects_table <- snp_results$Main_effects[[snp_model]]
  # Row of the glm table that corresponds to this snp-model term
  row_idx <- grep(pattern = snp_model_lvl, x = rownames(effects_table))
  plot_df[i, "snp"] <- snp_model_lvl
  plot_df[i, "gene"] <- snp_results$Gene
  plot_df[i, "OR"] <- effects_table[row_idx, 3]
  plot_df[i, "lower"] <- effects_table[row_idx, 4]
  plot_df[i, "upper"] <- effects_table[row_idx, 5]
  plot_df[i, "pvalue"] <- effects_table[row_idx, 6]
}
And plot them:
## Warning: geom_vline(): Ignoring `mapping` because `xintercept` was provided.

Covariate interaction
The next step in the analysis would be to examine all possible interactions of the 75 studied SNPs with the covariates Age82, Sex & E4status.
In practice this means repeating the models from the main effects but by adding one interaction term at a time.
In order to reduce the multiple testing performed and due to the previous findings of what the best models were, only the best snp models chosen for each main effect were employed in this subsequent step.
# Repeat the glms, adding one snp-by-covariate interaction term at a time
# (only the best inheritance model per snp is used, see text above)
master_list <- perform_analysis(.mode = "interaction", .data = DATASETS, snps = list_of_objects, covariates = variables)
And here we can see an example of the kind of results obtained by doing this analysis.
# Example: covariate-interaction results for the first snp in the All dataset
master_list$All[[1]]$Interactions$Other_covariates
## $Age75
## $Age75$Name
## NULL
##
## $Age75$Summary
## NULL
##
## $Age75$SF
## NULL
##
## $Age75$Significant
## NULL
##
##
## $Sex
## $Sex$Name
## [1] "rs5749131a1:SexMale" "rs5749131a2:SexMale"
##
## $Sex$Summary
## Estimate Std. Error OR lower upper Pr(>|t|)
## (Intercept) 0.669 0.154 1.953 1.443 2.644 1.473689e-05
## E4statusE4+ 1.124 0.059 3.076 2.742 3.451 6.757858e-80
## Age82>82 0.325 0.060 1.384 1.231 1.557 5.580781e-08
## CentreMADRID -0.980 0.168 0.375 0.270 0.522 5.618590e-09
## CentreNOTTINGHAM -0.406 0.175 0.666 0.473 0.939 2.036191e-02
## CentreOPTIMA -0.664 0.168 0.515 0.370 0.715 7.574688e-05
## CentreOVIEDO 0.343 0.189 1.410 0.973 2.042 6.971030e-02
## CentreSANTANDER -0.118 0.184 0.888 0.620 1.274 5.202003e-01
## CentreROTTERDAM -2.438 0.142 0.087 0.066 0.115 1.603023e-64
## rs5749131a1 -0.103 0.080 0.902 0.771 1.055 1.966077e-01
## rs5749131a2 -0.318 0.106 0.728 0.591 0.896 2.805680e-03
## SexMale -0.532 0.101 0.587 0.482 0.716 1.505749e-07
## rs5749131a1:SexMale -0.053 0.130 0.949 0.735 1.224 6.850088e-01
## rs5749131a2:SexMale 0.206 0.170 1.229 0.881 1.713 2.242916e-01
##
## $Sex$SF
## [1] 0.949 1.229
##
## $Sex$Significant
## [1] FALSE FALSE
##
##
## $E4status
## $E4status$Name
## [1] "rs5749131a1:E4statusE4+" "rs5749131a2:E4statusE4+"
##
## $E4status$Summary
## Estimate Std. Error OR lower upper Pr(>|t|)
## (Intercept) 0.684 0.154 1.983 1.465 2.684 9.529236e-06
## SexMale -0.520 0.059 0.594 0.529 0.667 1.791717e-18
## Age82>82 0.324 0.060 1.382 1.229 1.554 6.619788e-08
## CentreMADRID -0.980 0.168 0.375 0.270 0.522 5.775270e-09
## CentreNOTTINGHAM -0.404 0.175 0.668 0.474 0.941 2.096352e-02
## CentreOPTIMA -0.663 0.168 0.515 0.371 0.716 7.787719e-05
## CentreOVIEDO 0.345 0.189 1.412 0.975 2.046 6.825821e-02
## CentreSANTANDER -0.115 0.184 0.892 0.622 1.278 5.324574e-01
## CentreROTTERDAM -2.436 0.142 0.088 0.066 0.116 1.938331e-64
## rs5749131a1 -0.176 0.080 0.839 0.718 0.981 2.740924e-02
## rs5749131a2 -0.210 0.104 0.811 0.661 0.995 4.463333e-02
## E4statusE4+ 1.067 0.102 2.907 2.382 3.548 1.315826e-25
## rs5749131a1:E4statusE4+ 0.139 0.131 1.149 0.890 1.485 2.862701e-01
## rs5749131a2:E4statusE4+ -0.068 0.171 0.934 0.668 1.305 6.891326e-01
##
## $E4status$SF
## [1] 1.149 0.934
##
## $E4status$Significant
## [1] FALSE FALSE
##
##
## $Age82
## $Age82$Name
## [1] "rs5749131a1:Age82>82" "rs5749131a2:Age82>82"
##
## $Age82$Summary
## Estimate Std. Error OR lower upper Pr(>|t|)
## (Intercept) 0.690 0.156 1.993 1.467 2.708 1.052890e-05
## SexMale -0.523 0.059 0.593 0.528 0.665 1.127698e-18
## E4statusE4+ 1.125 0.059 3.079 2.744 3.455 6.524475e-80
## CentreMADRID -0.982 0.168 0.375 0.270 0.521 5.529357e-09
## CentreNOTTINGHAM -0.406 0.175 0.666 0.473 0.939 2.053118e-02
## CentreOPTIMA -0.665 0.168 0.514 0.370 0.715 7.551328e-05
## CentreOVIEDO 0.341 0.189 1.406 0.970 2.038 7.184415e-02
## CentreSANTANDER -0.119 0.184 0.888 0.619 1.274 5.190076e-01
## CentreROTTERDAM -2.441 0.143 0.087 0.066 0.115 1.541780e-64
## rs5749131a1 -0.196 0.092 0.822 0.686 0.984 3.271840e-02
## rs5749131a2 -0.157 0.119 0.855 0.677 1.079 1.867526e-01
## Age82>82 0.285 0.099 1.329 1.094 1.615 4.248406e-03
## rs5749131a1:Age82>82 0.138 0.126 1.148 0.896 1.471 2.748867e-01
## rs5749131a2:Age82>82 -0.157 0.166 0.855 0.617 1.184 3.455101e-01
##
## $Age82$SF
## [1] 1.148 0.855
##
## $Age82$Significant
## [1] FALSE FALSE
First, for the variable “Name” we have the interaction studied, then as “Summary” the full glm model results obtained, and the rest of variables, extract the most interesting results for the interaction, the “SF” and whether it is found “Significant” or not.
The SF is a statistic used to measure interactions in complex diseases. It is defined as the OR of the interaction term divided by the product of the ORs of each of the interacting factors on its own.
Thus, it makes a good and easy-to-understand metric for what we are measuring: if there were no interaction effect, we would expect the SF to be close to 1, as the impact of the combination should be about the same as the effect of each of its members on its own. However, if the SF is found to be greater or lower than 1, that means we have a synergistic effect in which the odds of having the disease are increased or reduced, respectively.
Having said that, we can obtain the significant results found for this snp-covariate interaction measured: